// Data Preparation for "The Impact of Open Access Mandates..." Bryan/Ozcan RESTAT 2020
// STATA SE 15
// See Data Appendix online for further details on raw data

// Raw Article Data downloaded from PubMed
// Raw Patent Data from USPTO

// Change working directory here if needed



// Sept2015FirstAuthorLocation includes state and country of first author for all articles in sample
insheet using Sept2015FirstAuthorLocation.csv, clear
sort pubmedid
save Sept2015FirstAuthorLocation, replace

// Sept2015ListofPapers includes covariates extracted from PubMed using XML queries, including "free full text"
//   and "pubmed central" dummies for open access as of that date, the date the article was on PMC, the forward
//   academic citations, the journal, and a tag for NIH funding
insheet using Sept2015Listofpapers.csv, clear
sort pubmedid
save Sept2015Listofpapers, replace

// Nov2017FrontPageMatches gives a list of front page patent citations to each article in our sample, matched using 
//  the algorithm in the online appendix.   
insheet using Nov2017FrontPageMatches.csv, clear
sort patentpubnum pubmedid
// 80061 front page matches in granted patents
quietly by patentpubnum pubmedid:  gen dup = cond(_N==1,0,_n)
// after dropping duplicates, 75686 remain
sum dup
// Drop if patent cited more than once in a single application
drop if dup>1
save Nov2017FrontPageMatchesStata, replace

// Sept2015RawMatches is a list of in-text matches from patents to academic papers using the algorithm in the online
//  appendix.
insheet using Sept2015RawMatches.csv, clear
sort patentpubnum pubmedid
quietly by patentpubnum pubmedid:  gen dup = cond(_N==1,0,_n)
sum dup
// Drop if patent cited more than once in a single application
drop if dup>1
// Drop 4 entries with application year before 2005
drop if appyear<2005
save Sept2015RawMatchesStata, replace

// Sept2017PatentCharacteristics_wForwCit is a list of patent covariates for all patents in the previous match, including
//  forward citations
insheet using Sept2017PatentCharacteristics_wForwCit.csv, clear
sort patentpubnum
save Sept2017PatentCharacteristicsStata, replace


// ----------------------------- merge patent characteristics with list of raw matches ---------------------------
merge 1:m patentpubnum using Sept2015RawMatchesStata
// 8 unmatched from Master are the 4 which we deleted by hand from match file b/c of data entry error 
//   plus the 4 which we deleted because app date was prior to 2005
// For now, drop these
drop if pubmedid==.
// Some matches using 2004 pub articles which had Create Date after 1/1/2005, drop these as well
drop if publicationyear==2004
// Drop if patent is a child of any kind
drop if pat_is_child_all==1 
// generate binary for match if patent app date and pub date less than three years apart
gen threeyrcite=0
replace threeyrcite=1 if lagbetweenpubandapp<=1095
// generate dummies for matches with 3+, 5+, 2 or fewer inventors
gen threeormoreinventors=0
replace threeormoreinventors=1 if no_of_inventors>=3
gen fiveormoreinventors=0
replace fiveormoreinventors=1 if no_of_inventors>=5
gen twoorfewerinventors=0
replace twoorfewerinventors=1 if no_of_inventors<=2
// Call patent assigned if assigned to any of our categories
gen assigned=0
replace assigned=1 if patent_is_assigned==1
gen unassigned=0
replace unassigned=1 if assigned==0
gen threeyrassignedcite=0
// Bigfamily==1 if patent was applied for in more than one patent office
gen bigfamily=0
replace bigfamily=1 if family_size>1
gen smallfamily=0
replace smallfamily=1 if bigfamily==0
replace threeyrassignedcite=1 if (lagbetweenpubandapp<=1095)&(assigned==1)
gen threeyrunassignedcite=0
replace threeyrunassignedcite=1 if (lagbetweenpubandapp<=1095)&(unassigned==1)
// Denote US invented patent cites
gen usinventor=0
replace usinventor=1 if inventor_country=="US"
// count unique patents and total cites: 28136 patents with 63106 cites
// Note some entries missing data on number of inventors, etc.
by patentpubnum, sort: gen nvals = _n == 1 

/* BOTTOM PANEL OF TABLE 1 */
count
count if nvals
sum no_of_inventors assigned assignee_corp assignee_univ assignee_govt assignee_indiv assignee_major_biotech_medical inventors_multiple_country has_further_class usinventor eventually_granted_dummy family_size granted_2016 granted_2016_and_assigned granted_assignee_same_as_app bigfamily assigned_in_grant_or_app total_citedby_patent_count if nvals
// Save patent file
sort pubmedid
save Sept2015MergedPatentData, replace
drop _merge

// Merge in author characteristics to raw matches data
merge m:1 pubmedid using Sept2015FirstAuthorLocation
drop if patentpubnum==.
// Calculate same country or state in first author and first inventor
gen samestate=0
replace samestate=1 if (inventor_state==articlestatecode&inventor_country=="US")
gen diffstate=0
replace diffstate=1 if samestate==0
gen samecountry=0
replace samecountry=1 if (inventor_country==articlecountrycode)
gen diffcountry=0
replace diffcountry=1 if samecountry==0
gen sameregion=0
replace sameregion=1 if (samestate==1&articlecountrycode=="US")|(samecountry==1&articlecountrycode!="US")
gen diffregion=0
replace diffregion=1 if sameregion==0
// Check percentage of citing patents from same state/country/state or country as the paper they cite
sum sameregion samestate samecountry if nvals
// Calculate number of pats with certain characteristics
collapse (sum) grantedcites=eventually_granted_dummy weightedcites=total_citedby_patent_count assignedcites=assigned unassignedcites=unassigned threeplusinv=threeormoreinventors fiveplusinv=fiveormoreinventors twominusinv=twoorfewerinventors multiclasspat=has_further_class grantedpat=eventually_granted_dummy granted2016=granted_2016 twoormorefamilysize=bigfamily familysizeone=smallfamily unassignedpat=unassigned assignedpat=assigned univpat=assignee_univ corppat=assignee_corp govtpat=assignee_govt hosppat=assignee_other_hospitals indivpat=assignee_individual majorbiotechpat=assignee_major_biotech_medical majorbiotechpatbroad=assignee_major_biotech_medical_b multicountrypat=inventors_multiple_country samestatepat=samestate samecountrypat=samecountry sameregionpat=sameregion diffstatepat=diffstate diffcountrypat=diffcountry diffregionpat=diffregion patischildparentinsample=pat_is_child_parent_is_insample patischild=pat_is_child_all threeyrcites=threeyrcite threeyrassignedcites=threeyrassignedcite threeyrunassignedcites=threeyrunassignedcite numberofinventors=no_of_inventors usinventordummy=usinventor (count) totalcites=patentpubnum, by(pubmedid)
merge 1:m pubmedid using Sept2015Listofpapers
// drop handful of publications with monthtag>Dec2012
rename monthtagjan20051 monthtag
drop if monthtag>96
sum totalcites familysizeone twoormorefamilysize granted2016
// Make zero cites if unmatched pubmedid
replace totalcites=0 if totalcites==.
replace weightedcites=0 if weightedcites==.
replace grantedcites=0 if grantedcites==.
replace granted2016=0 if granted2016==.
replace twoormorefamilysize=0 if twoormorefamilysize==.
replace familysizeone=0 if familysizeone==.
replace assignedcites=0 if assignedcites==.
replace unassignedcites=0 if unassignedcites==.
replace threeyrcites=0 if threeyrcites==.
replace threeyrassignedcites=0 if threeyrassignedcites==.
replace threeyrunassignedcites=0 if threeyrunassignedcites==.
replace threeplusinv=0 if threeplusinv==.
replace fiveplusinv=0 if fiveplusinv==.
replace twominusinv=0 if twominusinv==.
replace univpat=0 if univpat==.
replace corppat=0 if corppat==.
replace govtpat=0 if govtpat==.
replace hosppat=0 if hosppat==.
replace indivpat=0 if indivpat==.
replace assignedpat=0 if assignedpat==.
replace unassignedpat=0 if unassignedpat==.
replace grantedpat=0 if grantedpat==.
replace multiclasspat=0 if multiclasspat==.
replace majorbiotechpat=0 if majorbiotechpat==.
replace majorbiotechpatbroad=0 if majorbiotechpatbroad==.
replace multicountrypat=0 if multicountrypat==.
replace samestatepat=0 if samestatepat==.
replace samecountrypat=0 if samecountrypat==.
replace sameregionpat=0 if sameregionpat==.
replace diffstatepat=0 if diffstatepat==.
replace diffcountrypat=0 if diffcountrypat==.
replace diffregionpat=0 if diffregionpat==.
drop _merge
merge 1:m pubmedid using Sept2015FirstAuthorLocation
save Sept2015Crosssectiondataset, replace
